#setwd('/afs/inf.ed.ac.uk/user/s17/s1725186/Documents/PhD-InitialExperiments/SFARI_genes/R_Markdowns/')

library(tidyverse) ; library(reshape2) ; library(glue) ; library(plotly) ; library(dendextend)
library(RColorBrewer) ; library(viridis)
library(biomaRt)
library(knitr)

SFARI_colour_hue = function(r) {
  pal = c('#FF7631','#FFB100','#E8E328','#8CC83F','#62CCA6','#59B9C9','#b3b3b3','#808080','gray','#d9d9d9')[r]
}

Load raw count expression matrix and pubmed counts by gene

datExpr = read.csv('./../../FirstYearReview/Data/Gandal/RNAseq_ASD_datExpr.csv')
pubmed_counts = read.csv('./../Data/pubmed_count_by_gene.csv')

Convert ensembl IDs to gene names

# Get gene names from Ensembl IDs
getinfo = c('ensembl_gene_id','external_gene_id','gene_biotype')
mart = useMart(biomart='ENSEMBL_MART_ENSEMBL',  dataset='hsapiens_gene_ensembl', 
               host='feb2014.archive.ensembl.org')
datGenes = getBM(attributes=getinfo, filters=c('ensembl_gene_id'), values=datExpr$X, mart=mart)
datGenes = datGenes[match(datExpr$X, datGenes$ensembl_gene_id),]

rm(getinfo, mart)
datGenes = datGenes %>% mutate('meanExpr' = rowMeans(datExpr[,-1])) %>% 
           left_join(pubmed_counts, by=c('external_gene_id'='Gene')) %>% na.omit

*Perhaps later I should remove the entries with gene name 5s_rRNA

Gene Expression

Very heavy right tail

ggplotly(datGenes %>% ggplot(aes(meanExpr+1)) + geom_density(color='#0099cc', fill='#0099cc', alpha=0.3) +
         scale_x_log10() + theme_minimal())

Number of Publications

Very heavy right tail too

summary(datGenes$Count)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       0       0       0     723      10 7711933
datGenes %>% arrange(desc(Count)) %>% distinct(external_gene_id, .keep_all=TRUE) %>% 
             head(20) %>% kable(caption='Top 20 genes by number of publications')
Top 20 genes by number of publications
ensembl_gene_id external_gene_id gene_biotype meanExpr Count
ENSG00000180176 TH protein_coding 5.4090909 7711933
ENSG00000177606 JUN protein_coding 1970.3750000 2604914
ENSG00000136999 NOV protein_coding 348.3522727 2332690
ENSG00000228491 MICE pseudogene 0.0000000 1634646
ENSG00000173599 PC protein_coding 803.6136364 1464096
ENSG00000133424 LARGE protein_coding 1544.2727273 1393404
ENSG00000164458 T protein_coding 0.3522727 1215682
ENSG00000154059 IMPACT protein_coding 292.6931818 910176
ENSG00000087085 ACHE protein_coding 234.3409091 818059
ENSG00000119335 SET protein_coding 1844.2500000 502402
ENSG00000062485 CS protein_coding 973.9204545 407959
ENSG00000087111 PIGS protein_coding 375.0795455 374551
ENSG00000115718 PROC protein_coding 3.9431818 359016
ENSG00000168453 HR protein_coding 701.6818182 256256
ENSG00000175084 DES protein_coding 31.4772727 217417
ENSG00000105976 MET protein_coding 780.5568182 186947
ENSG00000188158 NHS protein_coding 572.5681818 169222
ENSG00000204490 TNF protein_coding 0.0000000 167834
ENSG00000169083 AR protein_coding 163.5568182 166496
ENSG00000010610 CD4 protein_coding 204.3068182 164440
ggplotly(datGenes %>% ggplot(aes(Count+1)) + geom_density(color='#0099cc', fill='#0099cc', alpha=0.3) +
         scale_x_log10() + theme_minimal())

Relation between publications and mean expression

There is a positive relation between these two variables and the correlation is quite high but only when transforming to logarithmic scale the variables

datGenes %>% ggplot(aes(meanExpr+1, Count+1)) + geom_point(color='#0099cc', alpha=0.1) +
             geom_smooth(color='gray', alpha=0.3) + scale_x_log10() + scale_y_log10() + 
             ggtitle(paste0('Cor=(' ,cor(log10(datGenes$meanExpr+1), log10(datGenes$Count+1)),')')) + theme_minimal()

Differentiating protein coding genes from the rest, the correlation decreases when separating the genes (I would have expected the opposite)

*The red stripes on the plot correspond to non coding genes that have many ensembl IDs, like Y_RNA, snoU13, U3, or SNORD112

datGenes %>% mutate(protein_coding = gene_biotype=='protein_coding') %>% 
             ggplot(aes(meanExpr+1, Count+1, color=protein_coding)) + 
             geom_point(alpha=0.1) + geom_smooth() + 
             scale_x_log10() + scale_y_log10() + 
             ggtitle(paste0('CorPC=(',
                            cor(log10(datGenes$meanExpr+1)[datGenes$gene_biotype=='protein_coding'],
                                log10(datGenes$Count+1)[datGenes$gene_biotype=='protein_coding']),'), ',
                            'CorNPC=(',
                            cor(log10(datGenes$meanExpr+1)[datGenes$gene_biotype!='protein_coding'],
                                log10(datGenes$Count+1)[datGenes$gene_biotype!='protein_coding']),')')) +
             theme_minimal()